{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from collections import Counter\n",
    "\n",
    "# Load the id_tags mapping from disk\n",
    "with open('id_tags_mapping.json', encoding='utf-8') as mapping_file:\n",
    "    id_tags = json.load(mapping_file)\n",
    "\n",
    "# Tally how often each tag occurs across all values of the mapping\n",
    "tag_counts = Counter()\n",
    "for tag_list in id_tags.values():\n",
    "    tag_counts.update(tag_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of unique IDs: 34558\n",
      "Total unique tags: 2565\n",
      "\n",
      "Top 10 most common tags:\n",
      "黑发: 6971\n",
      "蓝瞳: 6849\n",
      "棕发: 5053\n",
      "金发: 5012\n",
      "银发: 4832\n",
      "袜子: 4693\n",
      "马尾: 4565\n",
      "红瞳: 4305\n",
      "黑瞳: 4088\n",
      "长发: 4074\n",
      "\n",
      "Total tag occurrences: 325831\n",
      "Average usage per tag: 127.03\n"
     ]
    }
   ],
   "source": [
    "# Headline sizes: number of IDs and number of distinct tags\n",
    "print(f\"Total number of unique IDs: {len(id_tags)}\")\n",
    "print(f\"Total unique tags: {len(tag_counts)}\\n\")\n",
    "\n",
    "# List the ten most frequent tags, one \"tag: count\" line each\n",
    "print(\"Top 10 most common tags:\")\n",
    "top_tags = tag_counts.most_common(10)\n",
    "for name, occurrences in top_tags:\n",
    "    print(f\"{name}: {occurrences}\")\n",
    "\n",
    "# Aggregate usage: total occurrences and mean occurrences per distinct tag\n",
    "total_tags = sum(tag_counts.values())\n",
    "avg_tag_usage = total_tags / len(tag_counts)\n",
    "print(f\"\\nTotal tag occurrences: {total_tags}\")\n",
    "print(f\"Average usage per tag: {avg_tag_usage:.2f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total IDs: 34558\n",
      "IDs with at least one tag from top 50: 33163\n",
      "Percentage covered: 95.96%\n"
     ]
    }
   ],
   "source": [
    "# The 50 most frequent tags, held in a set for fast membership tests\n",
    "top_50_tags = {tag for tag, _ in tag_counts.most_common(50)}\n",
    "\n",
    "# Count IDs that carry at least one of those tags\n",
    "ids_with_top_50 = sum(\n",
    "    1 for tags in id_tags.values()\n",
    "    if any(tag in top_50_tags for tag in tags)\n",
    ")\n",
    "\n",
    "# Print the total first so a re-run reproduces the recorded three-line output\n",
    "print(f\"Total IDs: {len(id_tags)}\")\n",
    "print(f\"IDs with at least one tag from top 50: {ids_with_top_50}\")\n",
    "print(f\"Percentage covered: {(ids_with_top_50 / len(id_tags) * 100):.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total IDs: 34558\n",
      "IDs with at least one tag from top 100: 33806\n",
      "Percentage covered: 97.82%\n"
     ]
    }
   ],
   "source": [
    "# The 100 most frequent tags, held in a set for fast membership tests\n",
    "top_100_tags = {tag for tag, _ in tag_counts.most_common(100)}\n",
    "\n",
    "# Count IDs that carry at least one of those tags\n",
    "ids_with_top_100 = sum(\n",
    "    1 for tags in id_tags.values()\n",
    "    if any(tag in top_100_tags for tag in tags)\n",
    ")\n",
    "\n",
    "# Print the total first so a re-run reproduces the recorded three-line output\n",
    "print(f\"Total IDs: {len(id_tags)}\")\n",
    "print(f\"IDs with at least one tag from top 100: {ids_with_top_100}\")\n",
    "print(f\"Percentage covered: {(ids_with_top_100 / len(id_tags) * 100):.2f}%\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['黑发', '蓝瞳', '棕发', '金发', '银发', '袜子', '马尾', '红瞳', '黑瞳', '长发', '绿瞳', '棕瞳', '金瞳', '学生', '巨乳', '紫瞳', '蓝发', '刘海', '发饰', '萝莉', '长直', '双马尾', '短发', '动物(萌属性)', '过膝袜', '眼镜', '靴子', '紫发', '呆毛', '特殊第一人称', '元气', '红发', 'A型', '傲娇', '粉发', '妹妹', '反差萌', 'O型', '兽耳', '温柔', '高中生', '刀剑', '姐姐', 'B型', '天然呆', '贫乳', '绿发', '兽娘', '手套', '辫子', '连裤袜', '帽子', '腹黑', '尾巴', '御姐', '偶像(萌属性)', 'AB型', '黑长直', '黑色过膝袜', '组织领导人', '麻花辫', '灰瞳', '大小姐', '蝴蝶结', '披风', '卷发', '本名不明', '青梅竹马', '绝对领域', '高马尾', '黑色连裤袜', '橙发', '长鬓角', '粉瞳', '毒舌', '橙瞳', '白色过膝袜', '吃货', '天才', '金发碧眼', '翅膀', '军人', '枪械', '长刘海', '特殊瞳孔', '角', '虚拟UP主', '发箍', '和服', '哥哥', '优等生', '妖怪', '过膝靴', '长靴', '实妹', '机器人', '百合', '神明', '露脐装', '丝带', '乐器(萌属性)', '口癖', '强气', '高跟鞋', '水手服', '齐刘海', '耳饰', '教师', '遮眼发', '人妻', '发夹', '双胞胎', '认真', '忍者', '失忆', '料理达人', '无口', '及膝袜', '吐槽', '挑染', '魔法少女', '国家元首', '君主', '颜艺', '制服', '舰娘', '猫', '机娘', '弟弟', '正太', '治愈系', '虎牙', '遮单眼发', '中分', '第一人称Atashi', '三无', '西装', '实姐', '笨蛋', '歌手(萌属性)', 'S属性', '玩家角色', '第三人称己称', '短裤', '褐色皮肤', '分离袖子', '黑化', '混血儿', '转学生', '下双马尾', '弱气', '变身', '双角', '孤儿', '佣人', '老好人', '短靴', '中长发', '公主', '冰美人', '大叔', '尖耳朵', '美少女', '御宅族', '腿环', '大蝴蝶结', '领带', '姬发式', '女仆', '泳装', '裸足', '丸子头', '连衣裙', '人外', '魔法师', '格斗家', '商人', 'M形刘海', '妈妈', '小天使', '吊带袜', '围巾', '面具', '科学家', '小恶魔系', '猫耳', '超能力者', '异色瞳', '恶魔', '女王(性格)', '侧单马尾', '进气口发型', '歌姬', '糟糕', '头花', '骑士', '伤疤', '日本刀', '巨大武器', '中二病']\n"
     ]
    }
   ],
   "source": [
    "# Show the names of the 200 most frequent tags, most common first\n",
    "print([tag for tag, _ in tag_counts.most_common(200)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tags to keep when filtering; names are the raw (pre-substitution) tag names\n",
    "selected_tags = [\n",
    "    '黑发', '蓝瞳', '棕发', '金发', '银发', '马尾', '红瞳', '黑瞳', '长发', '绿瞳',\n",
    "    '棕瞳', '金瞳', '学生', '巨乳', '紫瞳', '蓝发', '刘海', '发饰', '萝莉', '长直', '双马尾', '短发',\n",
    "    '动物(萌属性)', '过膝袜', '眼镜', '靴子', '紫发', '呆毛', '元气', '红发', '傲娇',\n",
    "    '粉发', '妹妹', '反差萌', '兽耳', '温柔', '高中生', '刀剑', '姐姐', '天然呆', '贫乳', '绿发',\n",
    "    '兽娘', '手套', '辫子', '连裤袜', '帽子', '腹黑', '尾巴', '御姐', '偶像(萌属性)', '黑长直', '黑色过膝袜',\n",
    "    '麻花辫', '灰瞳', '大小姐', '蝴蝶结', '披风', '卷发', '本名不明', '青梅竹马', '绝对领域', '高马尾',\n",
    "    '黑色连裤袜', '橙发', '长鬓角', '粉瞳', '毒舌', '橙瞳', '白色过膝袜', '吃货', '天才', '金发碧眼', '翅膀', '军人',\n",
    "    '枪械', '长刘海', '特殊瞳孔', '角', '虚拟UP主', '发箍', '和服', '哥哥', '优等生', '妖怪', '过膝靴', '长靴', '实妹',\n",
    "    '机器人', '百合', '神明', '露脐装', '丝带', '乐器(萌属性)', '口癖', '强气', '高跟鞋', '水手服', '齐刘海', '耳饰',\n",
    "    '教师', '遮眼发', '人妻', '发夹', '双胞胎', '认真', '忍者', '失忆', '料理达人', '无口', '及膝袜', '吐槽', '挑染',\n",
    "    '魔法少女', '颜艺', '制服', '舰娘', '猫', '机娘', '弟弟', '正太', '治愈系', '虎牙', '遮单眼发', '中分', '三无', '西装',\n",
    "    '实姐', '笨蛋', '歌手(萌属性)', 'S属性', '短裤',\n",
    "    '褐色皮肤', '分离袖子', '黑化', '混血儿', '转学生', '下双马尾', '弱气', '变身', '双角', '孤儿', '佣人', '老好人', '短靴',\n",
    "    '中长发', '公主', '冰美人', '大叔', '尖耳朵', '美少女', '御宅族', '腿环', '大蝴蝶结', '领带', '姬发式', '女仆',\n",
    "    '泳装', '裸足', '丸子头', '连衣裙', '人外', '魔法师', '商人', 'M形刘海', '妈妈', '小天使', '吊带袜', '围巾',\n",
    "    '面具', '科学家', '小恶魔系', '猫耳', '超能力者', '异色瞳', '恶魔',\n",
    "]\n",
    "\n",
    "# Set form of the same tags, for membership tests\n",
    "selected_tags_set = set(selected_tags)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total IDs: 34558\n",
      "Number of selected tags: 176\n",
      "IDs in filtered mapping: 33876\n",
      "Percentage covered: 98.03%\n"
     ]
    }
   ],
   "source": [
    "# Raw tag name -> shortened name to use in the filtered output\n",
    "tag_substitutions = {\n",
    "    '偶像(萌属性)': '偶像',\n",
    "    '动物(萌属性)': '动物',\n",
    "    '乐器(萌属性)': '乐器',\n",
    "    '歌手(萌属性)': '歌手',\n",
    "}\n",
    "\n",
    "# Re-key the overall tag frequencies by the substituted names\n",
    "new_tag_counts = Counter()\n",
    "for raw_tag, occurrences in tag_counts.items():\n",
    "    new_tag_counts[tag_substitutions.get(raw_tag, raw_tag)] += occurrences\n",
    "\n",
    "# For every ID keep only the selected tags (renamed where applicable),\n",
    "# ordered from most to least frequent; IDs left with no tags are dropped\n",
    "filtered_id_tags = {}\n",
    "for id_key, tags in id_tags.items():\n",
    "    kept_tags = [\n",
    "        tag_substitutions.get(tag, tag)\n",
    "        for tag in tags\n",
    "        if tag in selected_tags_set\n",
    "    ]\n",
    "    if not kept_tags:\n",
    "        continue\n",
    "    filtered_id_tags[id_key] = sorted(\n",
    "        kept_tags, key=lambda name: new_tag_counts[name], reverse=True\n",
    "    )\n",
    "\n",
    "# Report how much of the original mapping survives the filter\n",
    "print(f\"Total IDs: {len(id_tags)}\")\n",
    "print(f\"Number of selected tags: {len(selected_tags_set)}\")\n",
    "print(f\"IDs in filtered mapping: {len(filtered_id_tags)}\")\n",
    "print(f\"Percentage covered: {(len(filtered_id_tags) / len(id_tags) * 100):.2f}%\")\n",
    "\n",
    "# Write the filtered mapping out as UTF-8 JSON (non-ASCII tags kept readable)\n",
    "with open('filtered_id_tags_mapping.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(filtered_id_tags, f, ensure_ascii=False, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total IDs: 34558\n",
      "Number of selected tags: 176\n",
      "IDs in filtered mapping: 33876\n",
      "Percentage covered: 98.03%\n"
     ]
    }
   ],
   "source": [
    "# Raw tag name -> shortened name to use in the filtered output\n",
    "tag_substitutions = {\n",
    "    '偶像(萌属性)': '偶像',\n",
    "    '动物(萌属性)': '动物',\n",
    "    '乐器(萌属性)': '乐器',\n",
    "    '歌手(萌属性)': '歌手',\n",
    "}\n",
    "\n",
    "# Create new tag_counts keyed by the substituted tag names\n",
    "new_tag_counts = Counter()\n",
    "for tag, count in tag_counts.items():\n",
    "    new_tag_counts[tag_substitutions.get(tag, tag)] += count\n",
    "\n",
    "# Build {integer ID: [selected tags, renamed, most frequent first]}\n",
    "filtered_id_tags = {}\n",
    "for id_key, tags in id_tags.items():\n",
    "    # int() raises ValueError if an ID is not a valid integer literal\n",
    "    int_key = int(id_key)\n",
    "\n",
    "    # Keep only tags in selected_tags_set, applying substitutions where needed\n",
    "    filtered_tags = [\n",
    "        tag_substitutions.get(tag, tag) for tag in tags if tag in selected_tags_set\n",
    "    ]\n",
    "    if not filtered_tags:\n",
    "        continue  # Only include IDs that have at least one selected tag\n",
    "\n",
    "    # Distinct string IDs such as \"7\" and \"007\" convert to the same integer;\n",
    "    # fail loudly instead of silently overwriting the earlier entry\n",
    "    if int_key in filtered_id_tags:\n",
    "        raise ValueError(f\"Duplicate ID after integer conversion: {id_key!r}\")\n",
    "\n",
    "    # Sort tags by their frequency in new_tag_counts (descending)\n",
    "    filtered_tags.sort(key=lambda x: new_tag_counts[x], reverse=True)\n",
    "    filtered_id_tags[int_key] = filtered_tags\n",
    "\n",
    "# Print statistics\n",
    "print(f\"Total IDs: {len(id_tags)}\")\n",
    "print(f\"Number of selected tags: {len(selected_tags_set)}\")\n",
    "print(f\"IDs in filtered mapping: {len(filtered_id_tags)}\")\n",
    "print(f\"Percentage covered: {(len(filtered_id_tags) / len(id_tags) * 100):.2f}%\")\n",
    "\n",
    "# Save the filtered mapping to a new file.\n",
    "# NOTE: JSON object keys are always strings, so json.dump writes the integer\n",
    "# keys back out as strings; readers must convert them to int again on load.\n",
    "with open('filtered_id_tags_mapping.json', 'w', encoding='utf-8') as f:\n",
    "    json.dump(filtered_id_tags, f, ensure_ascii=False, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Reload the filtered mapping saved earlier in this notebook. JSON object\n",
    "# keys are always strings, so convert them back to integer IDs on load.\n",
    "with open('filtered_id_tags_mapping.json', 'r', encoding='utf-8') as f:\n",
    "    filtered_id_tags = {int(id_key): tags for id_key, tags in json.load(f).items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
